library(tidyverse)
library(visdat)
library(naniar)
library(broom)
library(labelled)
library(gridExtra)
# Read the housing CSV and keep only the six columns analysed below.
data <- read_csv("Melbourne_housing_FULL.csv")
house <- select(data, Price, Rooms, Type, Distance, Bedroom2, Bathroom)

# Visual overview of column types and of missingness.
vis_dat(house, palette = "cb_safe")
vis_miss(house, sort_miss = TRUE) +
  theme(aspect.ratio = 1)

# Collect naniar's missingness summaries; print the overall proportion missing.
s_miss <- house %>% miss_summary()
s_miss[["miss_df_prop"]]
## [1] 0.1150128
# Per-variable missingness counts and percentages for the selected columns.
s_miss[["miss_var_summary"]]
## [[1]]
## # A tibble: 6 x 4
## variable n_miss pct_miss n_miss_cumsum
## <chr> <int> <dbl> <int>
## 1 Price 7610 21.832056689 7610
## 2 Rooms 0 0.000000000 7610
## 3 Type 0 0.000000000 7610
## 4 Distance 1 0.002868864 7611
## 5 Bedroom2 8217 23.573457268 15828
## 6 Bathroom 8226 23.599277046 24054
# Discard rows without a Price, then show any remaining rows lacking Distance.
house_clean <- filter(house, !is.na(Price))
house_clean %>%
  filter(is.na(Distance))
## # A tibble: 1 x 6
## Price Rooms Type Distance Bedroom2 Bathroom
## <int> <int> <chr> <dbl> <int> <int>
## 1 616000 3 h NA NA NA
# Discard rows without a Distance and re-summarise missingness per case.
house_clean <- filter(house_clean, !is.na(Distance))
s_miss_2 <- house_clean %>% miss_summary()
s_miss_2[["miss_case_table"]]
## [[1]]
## # A tibble: 3 x 3
## n_miss_in_case n_cases pct_miss
## <int> <int> <dbl>
## 1 0 20800 76.34148132
## 2 1 6 0.02202158
## 3 2 6440 23.63649710
# Per-variable missingness after removing rows missing Price or Distance.
s_miss_2[["miss_var_summary"]]
## [[1]]
## # A tibble: 6 x 4
## variable n_miss pct_miss n_miss_cumsum
## <chr> <int> <dbl> <int>
## 1 Price 0 0.00000 0
## 2 Rooms 0 0.00000 0
## 3 Type 0 0.00000 0
## 4 Distance 0 0.00000 0
## 5 Bedroom2 6440 23.63650 6440
## 6 Bathroom 6446 23.65852 12886
This implies that, after removing the rows missing “Price” or “Distance”, every observation that is missing “Bedroom2” is also missing “Bathroom”. Only 6 observations are missing “Bathroom” but not “Bedroom2”.
# Append one <column>_NA indicator column per variable (see printed output).
house_shadow <- house_clean %>% bind_shadow()
house_shadow
## # A tibble: 27,246 x 12
## Price Rooms Type Distance Bedroom2 Bathroom Price_NA Rooms_NA
## <int> <int> <chr> <dbl> <int> <int> <fctr> <fctr>
## 1 1480000 2 h 2.5 2 1 !NA !NA
## 2 1035000 2 h 2.5 2 1 !NA !NA
## 3 1465000 3 h 2.5 3 2 !NA !NA
## 4 850000 3 h 2.5 3 2 !NA !NA
## 5 1600000 4 h 2.5 3 1 !NA !NA
## 6 941000 2 h 2.5 2 1 !NA !NA
## 7 1876000 3 h 2.5 4 2 !NA !NA
## 8 1636000 2 h 2.5 2 1 !NA !NA
## 9 1000000 3 h 2.5 NA NA !NA !NA
## 10 745000 2 t 2.5 NA NA !NA !NA
## # ... with 27,236 more rows, and 4 more variables: Type_NA <fctr>,
## # Distance_NA <fctr>, Bedroom2_NA <fctr>, Bathroom_NA <fctr>
# Rooms against Distance, one panel per Type, coloured by whether Bedroom2
# is missing.
ggplot(house_shadow, aes(x = Distance, y = Rooms, colour = Bedroom2_NA)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Missingness summaries computed separately for each property Type.
s_miss_group <- miss_summary(group_by(house_clean, Type))
s_miss_group[["miss_case_table"]]
## [[1]]
## # A tibble: 9 x 4
## Type n_miss_in_case n_cases pct_miss
## <chr> <int> <int> <dbl>
## 1 h 0 15728 85.14969412
## 2 h 1 3 0.01624168
## 3 h 2 2740 14.83406421
## 4 t 0 1579 55.09420796
## 5 t 1 2 0.06978367
## 6 t 2 1285 44.83600837
## 7 u 0 3493 59.11321713
## 8 u 1 1 0.01692334
## 9 u 2 2415 40.86985954
# Bathroom against Distance per Type, drawn with naniar's geom_miss_point().
house_clean %>%
  ggplot(aes(x = Distance, y = Bathroom)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Same display with Rooms on the x axis.
house_clean %>%
  ggplot(aes(x = Rooms, y = Bathroom)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)
# K-nearest-neighbour imputation on the (Rooms, Bathroom) matrix, run
# separately for each property Type with k = 10.
library(impute)

# Sort once. The per-type pieces below are stacked in the order h, t, u,
# which is the order arrange(Type, ...) produces for these three types.
house_shadow_2 <- house_shadow %>%
  arrange(Type, Rooms, Bathroom)

# impute.knn() returns a list; the imputed matrix is its `data` element.
house_impute_h <- house_shadow_2 %>%
  filter(Type == "h") %>%
  select(Rooms, Bathroom) %>%
  as.matrix() %>%
  impute.knn(k = 10)
house_impute_t <- house_shadow_2 %>%
  filter(Type == "t") %>%
  select(Rooms, Bathroom) %>%
  as.matrix() %>%
  impute.knn(k = 10)
house_impute_u <- house_shadow_2 %>%
  filter(Type == "u") %>%
  select(Rooms, Bathroom) %>%
  as.matrix() %>%
  impute.knn(k = 10)

# Fix: the original stacked two whole result lists with one `$data` matrix
# and then used `$Rooms` on the result, which is not a data frame. Stack the
# three imputed matrices and expose them as named columns instead.
# Column order is fixed by select(Rooms, Bathroom) above.
imputed_values <- rbind(
  house_impute_h$data,
  house_impute_t$data,
  house_impute_u$data
)
house_impute <- data.frame(
  Rooms = imputed_values[, 1],
  Bathroom = imputed_values[, 2]
)

# Guard against misaligned rows (e.g. a Type value other than h/t/u).
stopifnot(nrow(house_impute) == nrow(house_shadow_2))

house_shadow_2 <- house_shadow_2 %>%
  mutate(
    Rooms = house_impute$Rooms,
    Bathroom = house_impute$Bathroom
  )
# Bathroom against Rooms after KNN imputation; colour marks rows whose
# Bathroom was originally missing.
house_shadow_2 %>%
  ggplot(aes(x = Rooms, y = Bathroom, colour = Bathroom_NA)) +
  geom_point(alpha = 0.7) +
  scale_colour_brewer(palette = "Dark2") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1)
# Mean imputation of Bathroom on the shadow data, done within each Type:
# missing values are replaced by the mean of that Type's observed values.
bath_by_type <- lapply(c("h", "t", "u"), function(type_code) {
  house_shadow %>%
    filter(Type == type_code) %>%
    mutate(
      Bathroom = ifelse(
        is.na(Bathroom),
        mean(Bathroom, na.rm = TRUE),
        Bathroom
      )
    )
})
Bath_h <- bath_by_type[[1]]
Bath_t <- bath_by_type[[2]]
Bath_u <- bath_by_type[[3]]

# Stack the three types back together in the order h, t, u.
house_shadow_3 <- do.call(rbind, bath_by_type)
# Mean-imputed Bathroom against Rooms; colour marks originally missing values.
house_shadow_3 %>%
  ggplot(aes(x = Rooms, y = Bathroom, colour = Bathroom_NA)) +
  geom_point(alpha = 0.7) +
  scale_colour_brewer(palette = "Dark2") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1)

# Same display against Distance.
house_shadow_3 %>%
  ggplot(aes(x = Distance, y = Bathroom, colour = Bathroom_NA)) +
  geom_point(alpha = 0.7) +
  scale_colour_brewer(palette = "Dark2") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1)
# Apply the per-Type mean imputation of Bathroom to house_clean itself.
bath_clean_by_type <- lapply(c("h", "t", "u"), function(type_code) {
  house_clean %>%
    filter(Type == type_code) %>%
    mutate(
      Bathroom = ifelse(
        is.na(Bathroom),
        mean(Bathroom, na.rm = TRUE),
        Bathroom
      )
    )
})
Bath_clean_h <- bath_clean_by_type[[1]]
Bath_clean_t <- bath_clean_by_type[[2]]
Bath_clean_u <- bath_clean_by_type[[3]]

# Replace house_clean with the stacked result (rows ordered h, t, u).
house_clean <- do.call(rbind, bath_clean_by_type)
# Bedroom2 against Rooms per Type, drawn with naniar's geom_miss_point().
house_clean %>%
  ggplot(aes(x = Rooms, y = Bedroom2)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)

# Same display with Distance on the x axis.
house_clean %>%
  ggplot(aes(x = Distance, y = Bedroom2)) +
  geom_miss_point() +
  facet_wrap(~Type) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 1)
# Mean imputation of Bedroom2 on the shadow data, done within each Type:
# missing values are replaced by the mean of that Type's observed values.
bed_by_type <- lapply(c("h", "t", "u"), function(type_code) {
  house_shadow %>%
    filter(Type == type_code) %>%
    mutate(
      Bedroom2 = ifelse(
        is.na(Bedroom2),
        mean(Bedroom2, na.rm = TRUE),
        Bedroom2
      )
    )
})
Bed_h <- bed_by_type[[1]]
Bed_t <- bed_by_type[[2]]
Bed_u <- bed_by_type[[3]]

# Stack the three types back together in the order h, t, u.
house_shadow_4 <- do.call(rbind, bed_by_type)
# Mean-imputed Bedroom2 against Rooms; colour marks originally missing values.
house_shadow_4 %>%
  ggplot(aes(x = Rooms, y = Bedroom2, colour = Bedroom2_NA)) +
  geom_point(alpha = 0.7) +
  scale_colour_brewer(palette = "Dark2") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1)

# Same display against Distance.
house_shadow_4 %>%
  ggplot(aes(x = Distance, y = Bedroom2, colour = Bedroom2_NA)) +
  geom_point(alpha = 0.7) +
  scale_colour_brewer(palette = "Dark2") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1)
# Apply the per-Type mean imputation of Bedroom2 to house_clean itself.
bed_clean_by_type <- lapply(c("h", "t", "u"), function(type_code) {
  house_clean %>%
    filter(Type == type_code) %>%
    mutate(
      Bedroom2 = ifelse(
        is.na(Bedroom2),
        mean(Bedroom2, na.rm = TRUE),
        Bedroom2
      )
    )
})
Bed_clean_h <- bed_clean_by_type[[1]]
Bed_clean_t <- bed_clean_by_type[[2]]
Bed_clean_u <- bed_clean_by_type[[3]]

# Replace house_clean with the stacked result (rows ordered h, t, u).
house_clean <- do.call(rbind, bed_clean_by_type)

# Store Type as a factor for the models below, then summarise every column.
house_clean[["Type"]] <- as.factor(house_clean[["Type"]])
summary(house_clean)
## Price Rooms Type Distance
## Min. : 85000 Min. : 1.000 h:18471 Min. : 0.00
## 1st Qu.: 635000 1st Qu.: 2.000 t: 2866 1st Qu.: 6.40
## Median : 870000 Median : 3.000 u: 5909 Median :10.50
## Mean : 1050189 Mean : 2.992 Mean :11.28
## 3rd Qu.: 1295000 3rd Qu.: 4.000 3rd Qu.:14.00
## Max. :11200000 Max. :16.000 Max. :48.10
## Bedroom2 Bathroom
## Min. : 0.000 Min. :0.000
## 1st Qu.: 2.000 1st Qu.:1.000
## Median : 3.000 Median :1.656
## Mean : 2.969 Mean :1.574
## 3rd Qu.: 3.302 3rd Qu.:2.000
## Max. :20.000 Max. :9.000
# Show the structure (column types and first values) of the cleaned data.
house_clean %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 27246 obs. of 6 variables:
## $ Price : int 1480000 1035000 1465000 850000 1600000 941000 1876000 1636000 1000000 1097000 ...
## $ Rooms : int 2 2 3 3 4 2 3 2 3 2 ...
## $ Type : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 1 1 ...
## $ Distance: num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Bedroom2: num 2 2 3 3 3 ...
## $ Bathroom: num 1 1 2 2 1 ...
# Price against Rooms (scatter) and against Type (boxplot).
house_clean %>%
  ggplot(aes(x = Rooms, y = Price)) +
  geom_point()
house_clean %>%
  ggplot(aes(x = Type, y = Price)) +
  geom_boxplot()
This plot suggests that houses have a larger price range than townhouses and units. The price range of townhouses is the smallest compared to the other types. Also, houses have a higher median price than townhouses, which have a higher median price than units.
# Price against Distance.
house_clean %>%
  ggplot(aes(x = Distance, y = Price)) +
  geom_point()
This plot is positively skewed. This means that there is a denser population of houses at smaller distances from the CBD (between 1km and 19km) than at larger distances. In general, the further away the house is from the CBD, the less expensive it is. There are a few outliers. For example, there is a very expensive house that is only 10km from the CBD. This suggests that there must be another factor or factors affecting the price, such as the size of the house or the amount of land.
# Price against Bedroom2.
house_clean %>%
  ggplot(aes(x = Bedroom2, y = Price)) +
  geom_point()
Most of the properties have between 3 and 5 bedrooms. Houses with fewer than 3 bedrooms or more than 5 bedrooms are less expensive than those with 3-5 bedrooms.
# Price against Bathroom.
house_clean %>%
  ggplot(aes(x = Bathroom, y = Price)) +
  geom_point()
This plot is slightly positively skewed. This means that there is a higher number of houses with a lower number of bathrooms (1-4) than with a higher number of bathrooms (more than 6).
# Models 1 and 2: Price on Rooms and Type, without and with an interaction.
mod1_data <- select(house_clean, Price, Rooms, Type)
mod1_1 <- lm(Price ~ Rooms + Type, data = mod1_data)
mod1_2 <- lm(Price ~ Rooms * Type, data = mod1_data)

# Coefficient table for the additive model.
mod1_1 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 360858.6 14809.522 24.36666 9.468044e-130
## 2 Rooms 253786.7 4282.258 59.26469 0.000000e+00
## 3 Typet -162362.5 11416.839 -14.22131 9.853357e-46
## 4 Typeu -244354.0 10079.434 -24.24283 1.810348e-128
# Coefficient table for the Rooms * Type interaction model.
mod1_2 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 349978.3807 16629.967 21.04504344 1.519148e-97
## 2 Rooms 257062.6348 4850.401 52.99821904 0.000000e+00
## 3 Typet -162088.5943 48379.835 -3.35033373 8.082384e-04
## 4 Typeu -193276.6625 29201.844 -6.61864583 3.692319e-11
## 5 Rooms:Typet 398.4062 16064.164 0.02480093 9.802139e-01
## 6 Rooms:Typeu -23222.5920 12340.508 -1.88181821 5.987136e-02
# Attach fitted values from both Rooms/Type models to the model data.
a_mod1_1 <- augment(mod1_1, data = mod1_data)
a_mod1_2 <- augment(mod1_2, data = mod1_data)

# Observed prices per Type with the Model 1 fitted line overlaid.
mod1_data %>%
  ggplot(aes(x = Rooms, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod1_1, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 1")

# Observed prices per Type with the Model 2 fitted line overlaid.
mod1_data %>%
  ggplot(aes(x = Rooms, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod1_2, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 2")

# Fitted lines only, coloured by Type, shown side by side.
p1 <- a_mod1_1 %>%
  ggplot(aes(x = Rooms, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 1")
p2 <- a_mod1_2 %>%
  ggplot(aes(x = Rooms, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 2")
grid.arrange(p1, p2, ncol = 2)

# One row of fit statistics per model (row 1 = mod1_1, row 2 = mod1_2).
do.call(rbind, lapply(list(mod1_1, mod1_2), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2350728 0.2349886 561064.4 2790.615 0 4 -399329.8 798669.6
## 2 0.2351740 0.2350336 561047.9 1675.189 0 6 -399328.0 798670.0
## BIC deviance df.residual
## 1 798710.7 8.575597e+15 27242
## 2 798727.5 8.574463e+15 27240
Models 1 and 2 are built on “Rooms” and “Type”. R-squared and adjusted R-squared increase slightly in Model 2, but AIC and BIC also increase in Model 2. Therefore Model 1, without the interaction, is better.
# Models 3 and 4: Price on Distance and Type, without and with an interaction.
mod2_data <- select(house_clean, Price, Distance, Type)
mod2_1 <- lm(Price ~ Distance + Type, data = mod2_data)
mod2_2 <- lm(Price ~ Distance * Type, data = mod2_data)

# Coefficient table for the additive model.
mod2_1 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 1579224.08 7605.880 207.63200 0.000000e+00
## 2 Distance -30408.93 517.161 -58.79973 0.000000e+00
## 3 Typet -325103.48 11309.362 -28.74640 4.744744e-179
## 4 Typeu -700014.11 8654.707 -80.88247 0.000000e+00
# Coefficient table for the Distance * Type interaction model.
mod2_2 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 1657355.33 8159.7957 203.11236 0.000000e+00
## 2 Distance -36736.63 572.0076 -64.22402 0.000000e+00
## 3 Typet -637679.99 26393.9655 -24.16007 1.290829e-127
## 4 Typeu -996862.14 15582.0299 -63.97511 0.000000e+00
## 5 Distance:Typet 28396.66 2226.3230 12.75496 3.737148e-37
## 6 Distance:Typeu 32797.37 1464.1195 22.40075 3.829478e-110
# Attach fitted values from both Distance/Type models to the model data.
a_mod2_1 <- augment(mod2_1, data = mod2_data)
a_mod2_2 <- augment(mod2_2, data = mod2_data)

# Observed prices per Type with the Model 3 fitted line overlaid.
mod2_data %>%
  ggplot(aes(x = Distance, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod2_1, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 3")

# Observed prices per Type with the Model 4 fitted line overlaid.
mod2_data %>%
  ggplot(aes(x = Distance, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod2_2, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 4")

# Fitted lines only, coloured by Type, shown side by side.
p3 <- a_mod2_1 %>%
  ggplot(aes(x = Distance, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 3")
p4 <- a_mod2_2 %>%
  ggplot(aes(x = Distance, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 4")
grid.arrange(p3, p4, ncol = 2)

# One row of fit statistics per model (row 1 = mod2_1, row 2 = mod2_2).
do.call(rbind, lapply(list(mod2_1, mod2_2), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.233705 0.2336206 561565.8 2769.426 0 4 -399354.1 798718.3
## 2 0.250578 0.2504405 555369.2 1821.603 0 6 -399050.8 798115.6
## BIC deviance df.residual
## 1 798759.3 8.590932e+15 27242
## 2 798173.1 8.401768e+15 27240
Models 3 and 4 are built on “Distance” and “Type”. R-squared and adjusted R-squared increase in Model 4, and AIC and BIC decrease in Model 4. Therefore Model 4, with the interaction, is better.
# Models 5 and 6: Price on Bedroom2 and Type, without and with an interaction.
mod3_data <- select(house_clean, Price, Bedroom2, Type)
mod3_1 <- lm(Price ~ Bedroom2 + Type, data = mod3_data)
mod3_2 <- lm(Price ~ Bedroom2 * Type, data = mod3_data)

# Coefficient table for the additive model.
mod3_1 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 431491.4 16549.297 26.07310 4.832269e-148
## 2 Bedroom2 233897.5 4847.452 48.25165 0.000000e+00
## 3 Typet -168967.7 11686.731 -14.45808 3.335091e-47
## 4 Typeu -266924.9 10682.262 -24.98768 2.855176e-136
# Coefficient table for the Bedroom2 * Type interaction model.
mod3_2 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 430412.67 17892.620 24.0553184 1.536410e-126
## 2 Bedroom2 234224.27 5267.057 44.4696673 0.000000e+00
## 3 Typet -203696.26 63854.878 -3.1899875 1.424401e-03
## 4 Typeu -247329.95 35994.023 -6.8714173 6.493532e-12
## 5 Bedroom2:Typet 12200.70 21763.499 0.5606037 5.750723e-01
## 6 Bedroom2:Typeu -9673.09 16191.124 -0.5974317 5.502242e-01
# Attach fitted values from both Bedroom2/Type models to the model data.
a_mod3_1 <- augment(mod3_1, data = mod3_data)
a_mod3_2 <- augment(mod3_2, data = mod3_data)

# Observed prices per Type with the Model 5 fitted line overlaid.
mod3_data %>%
  ggplot(aes(x = Bedroom2, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod3_1, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 5")

# Observed prices per Type with the Model 6 fitted line overlaid.
mod3_data %>%
  ggplot(aes(x = Bedroom2, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod3_2, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 6")

# Fitted lines only, coloured by Type, shown side by side.
p5 <- a_mod3_1 %>%
  ggplot(aes(x = Bedroom2, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 5")
p6 <- a_mod3_2 %>%
  ggplot(aes(x = Bedroom2, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 6")
grid.arrange(p5, p6, ncol = 2)

# One row of fit statistics per model (row 1 = mod3_1, row 2 = mod3_2).
do.call(rbind, lapply(list(mod3_1, mod3_2), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2044428 0.2043552 572187.5 2333.555 0 4 -399864.7 799739.3
## 2 0.2044640 0.2043180 572200.8 1400.213 0 6 -399864.3 799742.6
## BIC deviance df.residual
## 1 799780.4 8.918990e+15 27242
## 2 799800.1 8.918752e+15 27240
Models 5 and 6 are built on “Bedroom2” and “Type”. R-squared increases slightly in Model 6, but adjusted R-squared decreases slightly and AIC and BIC both increase in Model 6. Therefore Model 5, without the interaction, is better.
# Models 7 and 8: Price on Bathroom and Type, without and with an interaction.
mod4_data <- select(house_clean, Price, Bathroom, Type)
mod4_1 <- lm(Price ~ Bathroom + Type, data = mod4_data)
mod4_2 <- lm(Price ~ Bathroom * Type, data = mod4_data)

# Coefficient table for the additive model.
mod4_1 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 620245.0 10354.965 59.89831 0.000000e+00
## 2 Bathroom 352284.8 5737.873 61.39642 0.000000e+00
## 3 Typet -337960.8 11267.569 -29.99412 1.705965e-194
## 4 Typeu -410646.1 8773.218 -46.80678 0.000000e+00
# Coefficient table for the Bathroom * Type interaction model.
mod4_2 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 607539.64 10965.704 55.403613 0.000000e+00
## 2 Bathroom 359955.53 6137.748 58.646193 0.000000e+00
## 3 Typet -205305.58 44897.026 -4.572810 4.833423e-06
## 4 Typeu -349051.68 29983.711 -11.641377 3.010710e-31
## 5 Bathroom:Typet -72801.70 23757.607 -3.064353 2.183544e-03
## 6 Bathroom:Typeu -48839.83 23504.652 -2.077879 3.772983e-02
# Attach fitted values from both Bathroom/Type models to the model data.
a_mod4_1 <- augment(mod4_1, data = mod4_data)
a_mod4_2 <- augment(mod4_2, data = mod4_data)

# Observed prices per Type with the Model 7 fitted line overlaid.
mod4_data %>%
  ggplot(aes(x = Bathroom, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod4_1, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 7")

# Observed prices per Type with the Model 8 fitted line overlaid.
mod4_data %>%
  ggplot(aes(x = Bathroom, y = Price, color = Type)) +
  geom_point() +
  geom_line(data = a_mod4_2, aes(y = .fitted), color = "Black") +
  facet_wrap(~Type) +
  theme(aspect.ratio = 1) +
  ggtitle("Model 8")

# Fitted lines only, coloured by Type, shown side by side.
p7 <- a_mod4_1 %>%
  ggplot(aes(x = Bathroom, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 7")
p8 <- a_mod4_2 %>%
  ggplot(aes(x = Bathroom, y = .fitted, colour = Type)) +
  geom_line() +
  ggtitle("Model 8")
grid.arrange(p7, p8, ncol = 2)

# One row of fit statistics per model (row 1 = mod4_1, row 2 = mod4_2).
do.call(rbind, lapply(list(mod4_1, mod4_2), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2414173 0.2413337 558732.7 2889.902 0 4 -399216.3 798442.7
## 2 0.2417766 0.2416374 558620.9 1737.217 0 6 -399209.9 798433.8
## BIC deviance df.residual
## 1 798483.7 8.504469e+15 27242
## 2 798491.3 8.500441e+15 27240
Models 7 and 8 are built on “Bathroom” and “Type”. R-squared and adjusted R-squared increase slightly in Model 8. AIC decreases but BIC increases in Model 8. Therefore Model 8, with the interaction, is better.
# mod9: all five predictors, with Type interacting with Distance and Bathroom.
mod9 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type,
  data = house_clean
)
mod9 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 663264.48 15006.4190 44.198718 0.000000e+00
## 2 Typet -426691.60 43218.0747 -9.872990 5.957678e-23
## 3 Typeu -546583.56 28745.2765 -19.014726 4.259709e-80
## 4 Rooms 259856.84 6736.8833 38.572264 5.846387e-317
## 5 Bedroom2 -73893.54 7844.2773 -9.420057 4.853471e-21
## 6 Distance -42916.18 506.5156 -84.728237 0.000000e+00
## 7 Bathroom 272476.13 6484.9290 42.016826 0.000000e+00
## 8 Typet:Distance 27429.38 1950.8533 14.060197 9.566318e-45
## 9 Typeu:Distance 31833.69 1284.0267 24.792079 3.362211e-134
## 10 Typet:Bathroom -98648.54 20679.5595 -4.770340 1.848666e-06
## 11 Typeu:Bathroom -82537.18 20420.7129 -4.041836 5.317990e-05
# Model-level fit statistics for mod9.
mod9 %>% glance()
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4282488 0.4280389 485134.3 2039.936 0 11 -395364.5 790752.9
## BIC deviance df.residual
## 1 790851.5 6.409901e+15 27235
# mod10: the mod9 terms plus Bathroom * Rooms.
mod10 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bathroom * Rooms,
  data = house_clean
)
mod10 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 710162.301 26934.4256 26.366343 2.648267e-151
## 2 Typet -435324.966 43411.0994 -10.027965 1.262353e-23
## 3 Typeu -567783.778 30470.0532 -18.634158 5.120695e-77
## 4 Rooms 247765.719 8867.6353 27.940450 2.115589e-169
## 5 Bedroom2 -75058.449 7863.4404 -9.545243 1.467599e-21
## 6 Distance -42829.374 508.1733 -84.281046 0.000000e+00
## 7 Bathroom 244062.214 15023.2178 16.245668 4.546691e-59
## 8 Typet:Distance 27330.638 1951.3000 14.006374 2.032623e-44
## 9 Typeu:Distance 31870.788 1284.0686 24.820161 1.699627e-134
## 10 Typet:Bathroom -92113.081 20911.8771 -4.404821 1.062746e-05
## 11 Typeu:Bathroom -69505.799 21344.3612 -3.256401 1.129722e-03
## 12 Rooms:Bathroom 7406.735 3532.5543 2.096708 3.602862e-02
# Fit statistics side by side (row 1 = mod9, row 2 = mod10).
do.call(rbind, lapply(list(mod9, mod10), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4282488 0.4280389 485134.3 2039.936 0 11 -395364.5 790752.9
## 2 0.4283411 0.4281102 485104.1 1855.118 0 12 -395362.3 790750.5
## BIC deviance df.residual
## 1 790851.5 6.409901e+15 27235
## 2 790857.3 6.408867e+15 27234
Drop mod10 based on BIC: drop “Bathroom*Rooms” and use mod9 to continue.
# mod11: the mod9 terms plus Bedroom2 * Rooms.
mod11 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms,
  data = house_clean
)
mod11 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 552136.696 26239.7239 21.042016 1.618282e-97
## 2 Typet -407161.197 43363.1481 -9.389567 6.479729e-21
## 3 Typeu -503107.825 29940.9643 -16.803327 4.797209e-63
## 4 Rooms 289335.533 8829.5610 32.768960 5.014651e-231
## 5 Bedroom2 -37998.412 10480.3632 -3.625677 2.887308e-04
## 6 Distance -43079.248 507.2622 -84.925014 0.000000e+00
## 7 Bathroom 277489.652 6554.2522 42.337348 0.000000e+00
## 8 Typet:Distance 27410.289 1949.9391 14.056998 1.000546e-44
## 9 Typeu:Distance 31367.783 1286.5930 24.380501 6.808250e-130
## 10 Typet:Bathroom -110236.934 20791.4077 -5.302043 1.154059e-07
## 11 Typeu:Bathroom -101819.823 20750.1760 -4.906938 9.304318e-07
## 12 Rooms:Bedroom2 -9658.645 1871.2777 -5.161524 2.466664e-07
# Fit statistics side by side (row 1 = mod9, row 2 = mod11).
do.call(rbind, lapply(list(mod9, mod11), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4282488 0.4280389 485134.3 2039.936 0 11 -395364.5 790752.9
## 2 0.4288076 0.4285768 484906.1 1858.655 0 12 -395351.1 790728.3
## BIC deviance df.residual
## 1 790851.5 6.409901e+15 27235
## 2 790835.1 6.403637e+15 27234
Improvement in mod11, use mod11 to continue.
# mod12: the mod11 terms plus Distance * Rooms.
mod12 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms,
  data = house_clean
)
mod12 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 224859.170 29920.035 7.5153379 5.850877e-14
## 2 Typet -337732.786 43093.808 -7.8371534 4.776245e-15
## 3 Typeu -344537.284 30529.218 -11.2854933 1.799572e-29
## 4 Rooms 389081.103 9844.426 39.5229854 0.000000e+00
## 5 Bedroom2 -84467.060 10597.825 -7.9702257 1.645430e-15
## 6 Distance 1665.373 2083.953 0.7991412 4.242155e-01
## 7 Bathroom 277674.225 6496.255 42.7437397 0.000000e+00
## 8 Typet:Distance 19681.191 1964.002 10.0209618 1.354733e-23
## 9 Typeu:Distance 15400.310 1465.270 10.5102224 8.684685e-26
## 10 Typet:Bathroom -104220.654 20609.205 -5.0569952 4.286751e-07
## 11 Typeu:Bathroom -102939.387 20566.607 -5.0051712 5.616012e-07
## 12 Rooms:Bedroom2 3564.886 1948.642 1.8294210 6.734749e-02
## 13 Rooms:Distance -13233.349 598.129 -22.1245715 1.626210e-107
# Fit statistics side by side (row 1 = mod11, row 2 = mod12).
do.call(rbind, lapply(list(mod11, mod12), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4288076 0.4285768 484906.1 1858.655 0 12 -395351.1 790728.3
## 2 0.4388931 0.4386459 480614.9 1775.119 0 13 -395108.5 790244.9
## BIC deviance df.residual
## 1 790835.1 6.403637e+15 27234
## 2 790359.9 6.290568e+15 27233
Improvement in mod12, use mod12 to continue.
# mod13: the mod12 terms plus Bathroom * Distance.
mod13 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance,
  data = house_clean
)
mod13 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 156903.413 30283.7042 5.181117 2.221319e-07
## 2 Typet -358794.096 42992.3197 -8.345539 7.420634e-17
## 3 Typeu -300330.573 30625.3576 -9.806598 1.149884e-22
## 4 Rooms 341695.385 10470.4023 32.634408 3.475372e-229
## 5 Bedroom2 -82133.044 10566.8720 -7.772692 7.953696e-15
## 6 Distance 6261.587 2107.4872 2.971115 2.969797e-03
## 7 Bathroom 411439.140 12165.7279 33.819525 1.233299e-245
## 8 Typet:Distance 23678.688 1982.0238 11.946723 8.158345e-33
## 9 Typeu:Distance 15420.144 1460.7796 10.556106 5.343957e-26
## 10 Typet:Bathroom -117935.486 20573.1553 -5.732494 1.000094e-08
## 11 Typeu:Bathroom -140804.177 20709.7814 -6.798921 1.075813e-11
## 12 Rooms:Bedroom2 2969.983 1943.2093 1.528390 1.264272e-01
## 13 Rooms:Distance -8720.388 690.1405 -12.635670 1.699821e-36
## 14 Distance:Bathroom -11646.456 896.6672 -12.988605 1.847588e-38
# Fit statistics side by side (row 1 = mod12, row 2 = mod13).
do.call(rbind, lapply(list(mod12, mod13), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4388931 0.4386459 480614.9 1775.119 0 13 -395108.5 790244.9
## 2 0.4423478 0.4420816 479141.8 1661.639 0 14 -395024.3 790078.6
## BIC deviance df.residual
## 1 790359.9 6.290568e+15 27233
## 2 790201.8 6.251838e+15 27232
Improvement in mod13, use mod13 to continue.
# mod14: the mod13 terms plus Bedroom2 * Distance.
mod14 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Distance,
  data = house_clean
)
mod14 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 165463.994 30948.6152 5.346410 9.043753e-08
## 2 Typet -360920.652 43020.9183 -8.389422 5.118836e-17
## 3 Typeu -304686.188 30796.6213 -9.893494 4.858561e-23
## 4 Rooms 356033.693 14963.9339 23.792787 7.297197e-124
## 5 Bedroom2 -100042.968 17028.8124 -5.874923 4.279547e-09
## 6 Distance 5021.048 2301.5042 2.181638 2.914480e-02
## 7 Bathroom 415835.033 12599.3422 33.004503 2.898006e-234
## 8 Typet:Distance 23995.092 1995.9856 12.021676 3.318019e-33
## 9 Typeu:Distance 16072.559 1539.6252 10.439267 1.832844e-25
## 10 Typet:Bathroom -118321.491 20574.8667 -5.750778 8.978324e-09
## 11 Typeu:Bathroom -141869.555 20724.7067 -6.845431 7.786301e-12
## 12 Rooms:Bedroom2 2585.407 1964.2235 1.316249 1.881017e-01
## 13 Rooms:Distance -9980.675 1165.8843 -8.560605 1.180983e-17
## 14 Distance:Bathroom -12050.539 945.9193 -12.739500 4.551563e-37
## 15 Bedroom2:Distance 1840.777 1372.5037 1.341182 1.798726e-01
# Fit statistics side by side (row 1 = mod13, row 2 = mod14).
do.call(rbind, lapply(list(mod13, mod14), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4423478 0.4420816 479141.8 1661.639 0 14 -395024.3 790078.6
## 2 0.4423846 0.4420980 479134.8 1543.124 0 15 -395023.4 790078.8
## BIC deviance df.residual
## 1 790201.8 6.251838e+15 27232
## 2 790210.2 6.251425e+15 27231
Drop mod14 based on BIC: drop “Bedroom2*Distance” and use mod13 to continue.
# mod15: the mod13 terms plus Bedroom2 * Bathroom.
mod15 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom,
  data = house_clean
)
mod15 %>% tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 203275.449 31285.1110 6.497514 8.306751e-11
## 2 Typet -372516.655 43030.0159 -8.657135 5.100189e-18
## 3 Typeu -328808.155 30991.0095 -10.609792 3.019760e-26
## 4 Rooms 375895.628 11985.0126 31.363807 3.760122e-212
## 5 Bedroom2 -89810.691 10641.5867 -8.439596 3.340253e-17
## 6 Distance 5953.199 2106.8606 2.825626 4.722271e-03
## 7 Bathroom 314615.923 20530.3218 15.324452 8.721998e-53
## 8 Typet:Distance 23599.606 1980.8608 11.913814 1.208936e-32
## 9 Typeu:Distance 14995.610 1461.6893 10.259096 1.194389e-24
## 10 Typet:Bathroom -106391.742 20654.9891 -5.150898 2.610471e-07
## 11 Typeu:Bathroom -115308.602 21150.5933 -5.451790 5.029803e-08
## 12 Rooms:Bedroom2 -8928.799 2811.4988 -3.175814 1.495830e-03
## 13 Rooms:Distance -8367.288 692.3530 -12.085292 1.539512e-33
## 14 Distance:Bathroom -12086.729 899.2719 -13.440572 4.728247e-41
## 15 Bedroom2:Bathroom 27869.158 4761.6525 5.852833 4.888023e-09
# Fit statistics side by side (row 1 = mod13, row 2 = mod15).
do.call(rbind, lapply(list(mod13, mod15), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4423478 0.4420816 479141.8 1661.639 0 14 -395024.3 790078.6
## 2 0.4430484 0.4427621 478849.5 1547.281 0 15 -395007.2 790046.4
## BIC deviance df.residual
## 1 790201.8 6.251838e+15 27232
## 2 790177.8 6.243983e+15 27231
Improvement in mod15, use mod15 to continue.
# mod16: the mod15 terms plus the three-way Type * Distance * Bathroom.
mod16 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom + Type * Distance * Bathroom,
  data = house_clean
)
mod16 %>% tidy()
## term estimate std.error statistic
## 1 (Intercept) 189679.811 31506.4646 6.0203458
## 2 Typet -52785.628 91105.2170 -0.5793919
## 3 Typeu -273101.215 47643.9715 -5.7321253
## 4 Rooms 374596.304 11986.0836 31.2526023
## 5 Bedroom2 -90371.788 10640.9340 -8.4928435
## 6 Distance 7290.699 2139.8901 3.4070435
## 7 Bathroom 325683.894 20757.3553 15.6900477
## 8 Typet:Distance -7733.806 8124.4217 -0.9519208
## 9 Typeu:Distance 8362.646 4796.9940 1.7433098
## 10 Typet:Bathroom -285039.717 49342.7532 -5.7767291
## 11 Typeu:Bathroom -158331.092 35996.2689 -4.3985418
## 12 Rooms:Bedroom2 -8591.964 2812.2204 -3.0552241
## 13 Rooms:Distance -8359.437 692.1602 -12.0773140
## 14 Distance:Bathroom -12887.120 926.6322 -13.9074811
## 15 Bedroom2:Bathroom 27424.917 4761.8218 5.7593328
## 16 Typet:Distance:Bathroom 17335.574 4355.9057 3.9797863
## 17 Typeu:Distance:Bathroom 5167.350 3695.2765 1.3983662
## p.value
## 1 1.762708e-09
## 2 5.623295e-01
## 3 1.002268e-08
## 4 1.087606e-210
## 5 2.117675e-17
## 6 6.576559e-04
## 7 3.089860e-55
## 8 3.411456e-01
## 9 8.129081e-02
## 10 7.699546e-09
## 11 1.093938e-05
## 12 2.251098e-03
## 13 1.695532e-33
## 14 8.059078e-44
## 15 8.535507e-09
## 16 6.915593e-05
## 17 1.620145e-01
# Fit statistics side by side (row 1 = mod15, row 2 = mod16).
do.call(rbind, lapply(list(mod15, mod16), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4430484 0.4427621 478849.5 1547.281 0 15 -395007.2 790046.4
## 2 0.4434036 0.4430765 478714.4 1355.722 0 17 -394998.5 790033.0
## BIC deviance df.residual
## 1 790177.8 6.243983e+15 27231
## 2 790180.8 6.240001e+15 27229
Improvement based on R-squared, adjusted R-squared, deviance and AIC, with only a slight increase in BIC. Overall mod16 is better; use mod16 to continue.
# mod17: the mod16 terms plus the three-way Type * Distance * Rooms.
mod17 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom + Type * Distance * Bathroom +
    Type * Distance * Rooms,
  data = house_clean
)
mod17 %>% tidy()
## term estimate std.error statistic
## 1 (Intercept) 153335.9339 34079.7792 4.4993230
## 2 Typet 160337.9267 110819.5919 1.4468374
## 3 Typeu -232745.3473 56814.8426 -4.0965589
## 4 Rooms 399003.2372 15222.4403 26.2115159
## 5 Bedroom2 -87543.1379 10700.6154 -8.1811312
## 6 Distance 8341.2919 2238.5944 3.7261291
## 7 Bathroom 303632.1249 22267.0071 13.6359648
## 8 Typet:Distance -27109.6059 9813.3632 -2.7625194
## 9 Typeu:Distance 9930.7861 5758.4076 1.7245716
## 10 Typet:Bathroom -199280.8888 55257.3461 -3.6064144
## 11 Typeu:Bathroom -139153.5706 38738.4534 -3.5921303
## 12 Rooms:Bedroom2 -11826.7408 3137.0716 -3.7699940
## 13 Rooms:Distance -8961.5695 756.3693 -11.8481394
## 14 Distance:Bathroom -12364.8282 952.7664 -12.9778166
## 15 Bedroom2:Bathroom 31094.7155 4987.6865 6.2342963
## 16 Typet:Rooms -131877.5737 39314.2760 -3.3544449
## 17 Typeu:Rooms -28403.6718 22053.8185 -1.2879253
## 18 Typet:Distance:Bathroom 9357.9669 4876.3165 1.9190647
## 19 Typeu:Distance:Bathroom 5924.2878 3986.1382 1.4862224
## 20 Typet:Rooms:Distance 12015.5136 3370.7415 3.5646499
## 21 Typeu:Rooms:Distance -966.2631 2166.5391 -0.4459939
## p.value
## 1 6.845210e-06
## 2 1.479540e-01
## 3 4.205493e-05
## 4 1.412490e-149
## 5 2.933378e-16
## 6 1.948357e-04
## 7 3.368379e-42
## 8 5.739564e-03
## 9 8.461606e-02
## 10 3.110122e-04
## 11 3.285635e-04
## 12 1.635962e-04
## 13 2.641787e-32
## 14 2.125431e-38
## 15 4.604711e-10
## 16 7.963290e-04
## 17 1.977829e-01
## 18 5.498659e-02
## 19 1.372319e-01
## 20 3.649657e-04
## 21 6.556052e-01
# Fit statistics side by side (row 1 = mod16, row 2 = mod17).
do.call(rbind, lapply(list(mod16, mod17), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4434036 0.4430765 478714.4 1355.722 0 17 -394998.5 790033.0
## 2 0.4438100 0.4434014 478574.7 1086.205 0 21 -394988.6 790021.1
## BIC deviance df.residual
## 1 790180.8 6.240001e+15 27229
## 2 790201.8 6.235445e+15 27225
Improvement based on R-squared, adjusted R-squared, deviance and AIC, with only a slight increase in BIC. Overall mod17 is better; use mod17 to continue.
# mod18: the mod17 terms plus the three-way Type * Distance * Bedroom2.
mod18 <- lm(
  Price ~ Type + Rooms + Bedroom2 + Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms + Bathroom * Distance +
    Bedroom2 * Bathroom + Type * Distance * Bathroom +
    Type * Distance * Rooms + Type * Distance * Bedroom2,
  data = house_clean
)
mod18 %>% tidy()
## term estimate std.error statistic
## 1 (Intercept) 169347.63521 34998.4482 4.83871840
## 2 Typet 200533.59581 125923.3403 1.59250537
## 3 Typeu -252987.55105 62384.0937 -4.05532142
## 4 Rooms 424402.45478 20818.0364 20.38628650
## 5 Bedroom2 -121514.98986 20698.8640 -5.87061155
## 6 Distance 7118.20546 2399.5166 2.96651643
## 7 Bathroom 310488.20337 22664.7750 13.69915223
## 8 Typet:Distance -30174.33867 11309.9352 -2.66794974
## 9 Typeu:Distance 9456.36195 6257.1535 1.51128815
## 10 Typet:Bathroom -182834.68192 60324.6025 -3.03084768
## 11 Typeu:Bathroom -143315.26472 41169.9793 -3.48106235
## 12 Rooms:Bedroom2 -11672.78329 3177.1618 -3.67396561
## 13 Rooms:Distance -10974.27962 1437.1843 -7.63595827
## 14 Distance:Bathroom -12811.64414 992.5511 -12.90779255
## 15 Bedroom2:Bathroom 30802.12643 5000.7351 6.15951971
## 16 Typet:Rooms -123566.64680 54747.1964 -2.25704063
## 17 Typeu:Rooms -60955.76448 30861.3012 -1.97515212
## 18 Typet:Bedroom2 -34357.99203 73702.5364 -0.46617109
## 19 Typeu:Bedroom2 41866.78125 40824.4789 1.02553131
## 20 Bedroom2:Distance 2620.30853 1622.3557 1.61512580
## 21 Typet:Distance:Bathroom 8052.68241 5425.1020 1.48433751
## 22 Typeu:Distance:Bathroom 4690.60279 4356.8087 1.07661436
## 23 Typet:Rooms:Distance 11484.61820 4569.3704 2.51339184
## 24 Typeu:Rooms:Distance 182.43529 2865.9145 0.06365692
## 25 Typet:Bedroom2:Distance 2556.61191 6459.5486 0.39578802
## 26 Typeu:Bedroom2:Distance 82.77812 3974.4372 0.02082763
## p.value
## 1 1.313938e-06
## 2 1.112828e-01
## 3 5.020473e-05
## 4 1.072327e-91
## 5 4.392235e-09
## 6 3.014559e-03
## 7 1.422215e-42
## 8 7.636086e-03
## 9 1.307267e-01
## 10 2.440973e-03
## 11 5.002116e-04
## 12 2.392727e-04
## 13 2.314918e-14
## 14 5.259846e-38
## 15 7.398697e-10
## 16 2.401343e-02
## 17 4.826097e-02
## 18 6.410968e-01
## 19 3.051217e-01
## 20 1.062950e-01
## 21 1.377310e-01
## 22 2.816621e-01
## 23 1.196337e-02
## 24 9.492439e-01
## 25 6.922645e-01
## 26 9.833833e-01
# Stack the one-row fit summaries of mod17 and mod18 so that
# r.squared, AIC, BIC and deviance can be compared row by row.
do.call(rbind, lapply(list(mod17, mod18), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4438100 0.4434014 478574.7 1086.2050 0 21 -394988.6 790021.1
## 2 0.4439511 0.4434404 478558.0 869.3012 0 26 -394985.1 790024.2
## BIC deviance df.residual
## 1 790201.8 6.235445e+15 27225
## 2 790245.9 6.233863e+15 27220
No significant improvement in mod18 (AIC and BIC both increase), so drop the “Type*Distance*Bedroom2” interaction and use mod17 to continue.
# mod19: candidate model whose final term is the four-way interaction
# Type * Distance * Rooms * Bathroom. The `*` operator also expands to the
# lower-order terms, which is why Rooms:Bathroom and the related three-way
# terms appear in the coefficient table.
# The terms are listed in the same order as before so the coefficient table
# keeps the same row order and term names.
mod19 <- lm(
  Price ~ Type + Rooms + Bedroom2 +
    Distance * Type + Bathroom * Type +
    Bedroom2 * Rooms + Distance * Rooms +
    Bathroom * Distance + Bedroom2 * Bathroom +
    Type * Distance * Bathroom +
    Type * Distance * Rooms +
    Type * Distance * Rooms * Bathroom,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod19)
## term estimate std.error statistic
## 1 (Intercept) 197037.4274 60078.7490 3.2796526
## 2 Typet 265137.9541 327949.0300 0.8084731
## 3 Typeu -125526.2923 166884.8201 -0.7521732
## 4 Rooms 306849.2792 23488.1264 13.0640169
## 5 Bedroom2 45898.4173 18206.4912 2.5209919
## 6 Distance 7502.2076 4285.7436 1.7505032
## 7 Bathroom 176559.6460 34718.5340 5.0854580
## 8 Typet:Distance -33703.2011 29323.4308 -1.1493608
## 9 Typeu:Distance -11774.1197 18952.2137 -0.6212530
## 10 Typet:Bathroom -222181.2079 186939.6673 -1.1885183
## 11 Typeu:Bathroom -217065.2160 134803.6052 -1.6102330
## 12 Rooms:Bedroom2 -26792.1267 3675.0170 -7.2903408
## 13 Rooms:Distance -8788.5811 1278.0110 -6.8767648
## 14 Distance:Bathroom -11794.7506 2224.2625 -5.3027692
## 15 Bedroom2:Bathroom -18067.2372 7315.8567 -2.4695997
## 16 Typet:Rooms -188077.6573 124661.0062 -1.5087128
## 17 Typeu:Rooms -86757.1551 67336.5972 -1.2884101
## 18 Rooms:Bathroom 83966.4173 11001.1610 7.6325050
## 19 Typet:Distance:Bathroom 12716.8594 16707.7471 0.7611355
## 20 Typeu:Distance:Bathroom 23288.5469 15676.3234 1.4855873
## 21 Typet:Rooms:Distance 15148.1184 10866.3374 1.3940409
## 22 Typeu:Rooms:Distance 7739.1019 7006.6495 1.1045367
## 23 Typet:Rooms:Bathroom 20254.1479 66564.3271 0.3042793
## 24 Typeu:Rooms:Bathroom 44063.9855 51836.9421 0.8500499
## 25 Rooms:Distance:Bathroom -100.8202 538.4634 -0.1872368
## 26 Typet:Rooms:Distance:Bathroom -1634.1501 5823.8502 -0.2805962
## 27 Typeu:Rooms:Distance:Bathroom -6766.0006 5515.4419 -1.2267377
## p.value
## 1 1.040655e-03
## 2 4.188254e-01
## 3 4.519534e-01
## 4 6.921665e-39
## 5 1.170812e-02
## 6 8.004280e-02
## 7 3.691711e-07
## 8 2.504173e-01
## 9 5.344384e-01
## 10 2.346397e-01
## 11 1.073586e-01
## 12 3.175949e-13
## 13 6.254998e-12
## 14 1.149484e-07
## 15 1.353252e-02
## 16 1.313838e-01
## 17 1.976142e-01
## 18 2.377664e-14
## 19 4.465827e-01
## 20 1.373999e-01
## 21 1.633166e-01
## 22 2.693701e-01
## 23 7.609174e-01
## 24 3.953048e-01
## 25 8.514763e-01
## 26 7.790223e-01
## 27 2.199318e-01
# Stack the one-row fit summaries of mod17 and mod19 so that
# r.squared, AIC, BIC and deviance can be compared row by row.
do.call(rbind, lapply(list(mod17, mod19), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4438100 0.4434014 478574.7 1086.2050 0 21 -394988.6 790021.1
## 2 0.4455557 0.4450261 477875.8 841.2844 0 27 -394945.7 789947.5
## BIC deviance df.residual
## 1 790201.8 6.235445e+15 27225
## 2 790177.4 6.215874e+15 27219
mod19 improves on mod17 based on r.squared, adj.r.squared, deviance, AIC and BIC.
# mod20: full factorial model. Chaining `*` across all five predictors expands
# to every main effect and every interaction up to the five-way term.
mod20 <- lm(
  Price ~ Type * Distance * Rooms * Bathroom * Bedroom2,
  data = house_clean
)
# Coefficient table: estimate, std.error, statistic and p.value per term.
tidy(mod20)
## term estimate std.error
## 1 (Intercept) 1651739.647 128981.0113
## 2 Typet -418833.006 1044812.1727
## 3 Typeu -838031.353 556957.3610
## 4 Distance -91729.663 10666.6232
## 5 Rooms -234309.878 59712.8659
## 6 Bathroom -564195.797 73732.1048
## 7 Bedroom2 -194602.888 60362.9576
## 8 Typet:Distance 80383.701 100787.2013
## 9 Typeu:Distance 2940.536 57139.3170
## 10 Typet:Rooms -75893.890 751899.4101
## 11 Typeu:Rooms 425971.688 286761.3016
## 12 Distance:Rooms 27353.467 4599.4055
## 13 Typet:Bathroom 264978.088 690827.1483
## 14 Typeu:Bathroom -40276.510 525677.2145
## 15 Distance:Bathroom 27100.675 5021.4045
## 16 Rooms:Bathroom 369138.688 31599.1885
## 17 Typet:Bedroom2 -22875.230 758403.8631
## 18 Typeu:Bedroom2 -332726.024 268823.3287
## 19 Distance:Bedroom2 19785.954 4963.1231
## 20 Rooms:Bedroom2 72074.133 10973.3148
## 21 Bathroom:Bedroom2 56597.359 28428.0905
## 22 Typet:Distance:Rooms -3856.725 78232.8292
## 23 Typeu:Distance:Rooms -14161.389 25388.9501
## 24 Typet:Distance:Bathroom -46977.940 67662.6185
## 25 Typeu:Distance:Bathroom 42104.014 52067.5833
## 26 Typet:Rooms:Bathroom -67525.277 414527.5629
## 27 Typeu:Rooms:Bathroom -218570.097 249424.0104
## 28 Distance:Rooms:Bathroom -16926.380 2307.5876
## 29 Typet:Distance:Bedroom2 -38315.488 80037.6443
## 30 Typeu:Distance:Bedroom2 36558.705 31430.4382
## 31 Typet:Rooms:Bedroom2 40057.516 145714.4692
## 32 Typeu:Rooms:Bedroom2 12454.291 86747.1302
## 33 Distance:Rooms:Bedroom2 -7438.830 872.9480
## 34 Typet:Bathroom:Bedroom2 -6923.578 430922.0786
## 35 Typeu:Bathroom:Bedroom2 353715.783 235387.8019
## 36 Distance:Bathroom:Bedroom2 -3299.722 2099.3486
## 37 Rooms:Bathroom:Bedroom2 -40456.694 3073.4865
## 38 Typet:Distance:Rooms:Bathroom 7732.189 43297.8940
## 39 Typeu:Distance:Rooms:Bathroom 1320.284 21569.5432
## 40 Typet:Distance:Rooms:Bedroom2 6251.990 13347.4182
## 41 Typeu:Distance:Rooms:Bedroom2 -6211.043 8127.4193
## 42 Typet:Distance:Bathroom:Bedroom2 19057.655 44888.7817
## 43 Typeu:Distance:Bathroom:Bedroom2 -35278.299 26753.1730
## 44 Typet:Rooms:Bathroom:Bedroom2 -16475.944 80852.3486
## 45 Typeu:Rooms:Bathroom:Bedroom2 -42218.691 71071.7925
## 46 Distance:Rooms:Bathroom:Bedroom2 2417.957 203.9803
## 47 Typet:Distance:Rooms:Bathroom:Bedroom2 -2866.244 7533.5852
## 48 Typeu:Distance:Rooms:Bathroom:Bedroom2 6554.167 6354.0599
## statistic p.value
## 1 12.80606835 1.945376e-37
## 2 -0.40086919 6.885196e-01
## 3 -1.50465980 1.324232e-01
## 4 -8.59969090 8.415802e-18
## 5 -3.92394293 8.732553e-05
## 6 -7.65196924 2.044701e-14
## 7 -3.22387929 1.266158e-03
## 8 0.79755862 4.251336e-01
## 9 0.05146257 9.589573e-01
## 10 -0.10093623 9.196018e-01
## 11 1.48545737 1.374343e-01
## 12 5.94717444 2.761402e-09
## 13 0.38356641 7.013028e-01
## 14 -0.07661833 9.389277e-01
## 15 5.39703086 6.831887e-08
## 16 11.68190405 1.875305e-31
## 17 -0.03016233 9.759378e-01
## 18 -1.23771261 2.158333e-01
## 19 3.98659349 6.720369e-05
## 20 6.56812770 5.187071e-11
## 21 1.99089555 4.650236e-02
## 22 -0.04929804 9.606821e-01
## 23 -0.55777767 5.770008e-01
## 24 -0.69429680 4.875020e-01
## 25 0.80864161 4.187284e-01
## 26 -0.16289695 8.706008e-01
## 27 -0.87629934 3.808751e-01
## 28 -7.33509747 2.277499e-13
## 29 -0.47871834 6.321429e-01
## 30 1.16316243 2.447738e-01
## 31 0.27490418 7.833919e-01
## 32 0.14357006 8.858410e-01
## 33 -8.52150365 1.655217e-17
## 34 -0.01606689 9.871811e-01
## 35 1.50269377 1.329296e-01
## 36 -1.57178371 1.160124e-01
## 37 -13.16312734 1.888745e-39
## 38 0.17858118 8.582679e-01
## 39 0.06121058 9.511920e-01
## 40 0.46840445 6.394991e-01
## 41 -0.76420855 4.447496e-01
## 42 0.42455274 6.711661e-01
## 43 -1.31865851 1.872944e-01
## 44 -0.20377818 8.385284e-01
## 45 -0.59402880 5.524978e-01
## 46 11.85387573 2.468299e-32
## 47 -0.38046210 7.036054e-01
## 48 1.03149277 3.023190e-01
# Stack the one-row fit summaries of mod19 and mod20 so that
# r.squared, AIC, BIC and deviance can be compared row by row.
do.call(rbind, lapply(list(mod19, mod20), glance))
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4455557 0.4450261 477875.8 841.2844 0 27 -394945.7 789947.5
## 2 0.4498955 0.4489449 476185.6 473.2662 0 48 -394838.7 789775.3
## BIC deviance df.residual
## 1 790177.4 6.215874e+15 27219
## 2 790177.8 6.167221e+15 27198
Based on r.squared, adj.r.squared, deviance and AIC, mod20 is the best model (BIC is essentially unchanged: 790177.8 versus 790177.4 for mod19), but many of the predictors in mod20 are not significant.